library(tidyverse)
## Loading tidyverse: ggplot2
## Loading tidyverse: tibble
## Loading tidyverse: tidyr
## Loading tidyverse: readr
## Loading tidyverse: purrr
## Loading tidyverse: dplyr
## Conflicts with tidy packages ----------------------------------------------
## filter(): dplyr, stats
## lag(): dplyr, stats
library(modelr)
library(ggplot2)
# Load the oj data
# Read oj.csv into the tibble `oj`; readr guesses the column types.
oj <- read_csv(file = "oj.csv")
## Parsed with column specification:
## cols(
## store = col_integer(),
## brand = col_character(),
## week = col_integer(),
## logmove = col_double(),
## feat = col_integer(),
## price = col_double(),
## AGE60 = col_double(),
## EDUC = col_double(),
## ETHNIC = col_double(),
## INCOME = col_double(),
## HHLARGE = col_double(),
## WORKWOM = col_double(),
## HVAL150 = col_double(),
## SSTRDIST = col_double(),
## SSTRVOL = col_double(),
## CPDIST5 = col_double(),
## CPWVOL5 = col_double()
## )
# Show the first rows of the tibble together with its column types.
print(oj)
## # A tibble: 28,947 × 17
## store brand week logmove feat price AGE60 EDUC
## <int> <chr> <int> <dbl> <int> <dbl> <dbl> <dbl>
## 1 2 tropicana 40 9.018695 0 3.87 0.2328647 0.2489349
## 2 2 tropicana 46 8.723231 0 3.87 0.2328647 0.2489349
## 3 2 tropicana 47 8.253228 0 3.87 0.2328647 0.2489349
## 4 2 tropicana 48 8.987197 0 3.87 0.2328647 0.2489349
## 5 2 tropicana 50 9.093357 0 3.87 0.2328647 0.2489349
## 6 2 tropicana 51 8.877382 0 3.87 0.2328647 0.2489349
## 7 2 tropicana 52 9.294682 0 3.29 0.2328647 0.2489349
## 8 2 tropicana 53 8.954674 0 3.29 0.2328647 0.2489349
## 9 2 tropicana 54 9.049232 0 3.29 0.2328647 0.2489349
## 10 2 tropicana 57 8.613230 0 3.29 0.2328647 0.2489349
## # ... with 28,937 more rows, and 9 more variables: ETHNIC <dbl>,
## # INCOME <dbl>, HHLARGE <dbl>, WORKWOM <dbl>, HVAL150 <dbl>,
## # SSTRDIST <dbl>, SSTRVOL <dbl>, CPDIST5 <dbl>, CPWVOL5 <dbl>
# plot(oj)  (left disabled)
# List the column names of the data set.
colnames(oj)
## [1] "store" "brand" "week" "logmove" "feat" "price"
## [7] "AGE60" "EDUC" "ETHNIC" "INCOME" "HHLARGE" "WORKWOM"
## [13] "HVAL150" "SSTRDIST" "SSTRVOL" "CPDIST5" "CPWVOL5"
# Per-column summary statistics for the whole data set.
oj %>% summary()
## store brand week logmove
## Min. : 2.00 Length:28947 Min. : 40.0 Min. : 4.159
## 1st Qu.: 53.00 Class :character 1st Qu.: 70.0 1st Qu.: 8.490
## Median : 86.00 Mode :character Median :101.0 Median : 9.034
## Mean : 80.88 Mean :100.5 Mean : 9.168
## 3rd Qu.:111.00 3rd Qu.:130.0 3rd Qu.: 9.765
## Max. :137.00 Max. :160.0 Max. :13.482
## feat price AGE60 EDUC
## Min. :0.0000 Min. :0.520 Min. :0.05805 Min. :0.04955
## 1st Qu.:0.0000 1st Qu.:1.790 1st Qu.:0.12210 1st Qu.:0.14598
## Median :0.0000 Median :2.170 Median :0.17065 Median :0.22939
## Mean :0.2373 Mean :2.282 Mean :0.17313 Mean :0.22522
## 3rd Qu.:0.0000 3rd Qu.:2.730 3rd Qu.:0.21395 3rd Qu.:0.28439
## Max. :1.0000 Max. :3.870 Max. :0.30740 Max. :0.52836
## ETHNIC INCOME HHLARGE WORKWOM
## Min. :0.02425 Min. : 9.867 Min. :0.01351 Min. :0.2445
## 1st Qu.:0.04191 1st Qu.:10.456 1st Qu.:0.09794 1st Qu.:0.3126
## Median :0.07466 Median :10.635 Median :0.11122 Median :0.3556
## Mean :0.15556 Mean :10.617 Mean :0.11560 Mean :0.3592
## 3rd Qu.:0.18776 3rd Qu.:10.797 3rd Qu.:0.13517 3rd Qu.:0.4023
## Max. :0.99569 Max. :11.236 Max. :0.21635 Max. :0.4723
## HVAL150 SSTRDIST SSTRVOL CPDIST5
## Min. :0.002509 Min. : 0.1321 Min. :0.4000 Min. :0.7725
## 1st Qu.:0.123486 1st Qu.: 2.7670 1st Qu.:0.7273 1st Qu.:1.6262
## Median :0.346154 Median : 4.6507 Median :1.1154 Median :1.9634
## Mean :0.343766 Mean : 5.0973 Mean :1.2073 Mean :2.1204
## 3rd Qu.:0.528313 3rd Qu.: 6.6506 3rd Qu.:1.5385 3rd Qu.:2.5337
## Max. :0.916700 Max. :17.8560 Max. :2.5714 Max. :4.1079
## CPWVOL5
## Min. :0.09456
## 1st Qu.:0.27167
## Median :0.38323
## Mean :0.43891
## 3rd Qu.:0.56024
## Max. :1.14337
# Scatter of logmove against log(price), one colour per brand.
ggplot(
  data = oj,
  mapping = aes(x = log(price), y = logmove, colour = brand)
) +
  geom_point()

# geom_line(oj, aes(x = log(price), y = logmove))
# Scatter of logmove against log(price) with no brand colouring.
oj %>%
  ggplot(aes(x = log(price), y = logmove)) +
  geom_point()

# Pull the response and the log-transformed price out of `oj` as plain vectors.
price_log <- log(oj[["price"]])
logmove <- oj[["logmove"]]
# Quartiles, mean and range of logmove.
summary(logmove)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 4.159 8.490 9.034 9.168 9.765 13.480
# Quartiles, mean and range of log(price).
price_log %>% summary()
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -0.6539 0.5822 0.7747 0.7841 1.0040 1.3530
# Scatter of log(price) (y) against logmove (x).
# Built with ggplot() straight from `oj`: qplot() is deprecated as of
# ggplot2 3.4.0, and mapping columns of `oj` avoids relying on free-floating
# copies of the data in the global environment.
ggplot(oj, aes(x = logmove, y = log(price))) +
  geom_point()

# Regress logmove on log(price)
# Pooled log-log fit: one intercept and one log(price) slope for every brand.
model1 <- lm(formula = logmove ~ log(price), data = oj)
summary(model1)
##
## Call:
## lm(formula = logmove ~ log(price), data = oj)
##
## Residuals:
## Min 1Q Median 3Q Max
## -5.0441 -0.5853 -0.0330 0.5756 3.7264
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 10.42342 0.01535 679.04 <2e-16 ***
## log(price) -1.60131 0.01836 -87.22 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.9071 on 28945 degrees of freedom
## Multiple R-squared: 0.2081, Adjusted R-squared: 0.2081
## F-statistic: 7608 on 1 and 28945 DF, p-value: < 2.2e-16
# Keep the brand labels available as a standalone vector.
brand <- oj$brand
# log(price) for each brand, drawn as points.
# ggplot() on `oj` replaces qplot(), which is deprecated as of ggplot2 3.4.0;
# inside aes() the names refer to the columns of `oj`.
ggplot(oj, aes(x = brand, y = log(price))) +
  geom_point()

# Number of observations at each distinct logmove value.
# ggplot() + geom_bar() draws the same bars as qplot(..., geom = "bar"),
# without the qplot() call that is deprecated as of ggplot2 3.4.0.
# NOTE(review): logmove is a double column, so geom_histogram() may be what
# was intended here — confirm before switching.
ggplot(oj, aes(x = logmove)) +
  geom_bar()

# Bar chart of the number of observations per brand.
# The plot is stored under a descriptive name so that base::c is not masked,
# and only `brand` is mapped: a `group` argument given to ggplot() outside
# aes() is not used as an aesthetic, so it is omitted.
brand_count_plot <- ggplot(data = oj) +
  geom_bar(aes(x = brand))
brand_count_plot

# logmove (x) for each brand (y), drawn as points.
# ggplot() on `oj` replaces qplot(), which is deprecated as of ggplot2 3.4.0,
# and reads both variables from the data frame instead of global vectors.
ggplot(oj, aes(x = logmove, y = brand)) +
  geom_point()

# Regress logmove on log(price) with brand interactions
# Full interaction model: `log(price) * brand` expands to both main effects
# plus their interaction, i.e. a separate intercept and a separate log(price)
# slope for each brand.
model2 <- lm(formula = logmove ~ log(price) * brand, data = oj)
summary(model2)
##
## Call:
## lm(formula = logmove ~ log(price) * brand, data = oj)
##
## Residuals:
## Min 1Q Median 3Q Max
## -5.4434 -0.5232 -0.0494 0.4884 3.4901
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 10.95468 0.02070 529.136 <2e-16 ***
## log(price) -3.37753 0.03619 -93.322 <2e-16 ***
## brandminute.maid 0.88825 0.04155 21.376 <2e-16 ***
## brandtropicana 0.96239 0.04645 20.719 <2e-16 ***
## log(price):brandminute.maid 0.05679 0.05729 0.991 0.322
## log(price):brandtropicana 0.66576 0.05352 12.439 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.7911 on 28941 degrees of freedom
## Multiple R-squared: 0.3978, Adjusted R-squared: 0.3977
## F-statistic: 3823 on 5 and 28941 DF, p-value: < 2.2e-16
# Interaction term only: one shared intercept and a separate log(price) slope
# for each brand (no brand main effect).
model2.1 <- lm(formula = logmove ~ log(price):brand, data = oj)
summary(model2.1)
##
## Call:
## lm(formula = logmove ~ log(price):brand, data = oj)
##
## Residuals:
## Min 1Q Median 3Q Max
## -5.7859 -0.5188 -0.0570 0.4840 3.5856
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 11.29172 0.01668 676.8 <2e-16 ***
## log(price):branddominicks -3.92032 0.03042 -128.9 <2e-16 ***
## log(price):brandminute.maid -2.65843 0.02243 -118.5 <2e-16 ***
## log(price):brandtropicana -2.13001 0.01734 -122.8 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.801 on 28943 degrees of freedom
## Multiple R-squared: 0.3827, Adjusted R-squared: 0.3826
## F-statistic: 5981 on 3 and 28943 DF, p-value: < 2.2e-16
# Additive model: brand shifts the intercept, while the log(price) slope is
# shared by all brands.
model2.2 <- lm(formula = logmove ~ log(price) + brand, data = oj)
summary(model2.2)
##
## Call:
## lm(formula = logmove ~ log(price) + brand, data = oj)
##
## Residuals:
## Min 1Q Median 3Q Max
## -5.3152 -0.5246 -0.0502 0.4929 3.5088
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 10.82882 0.01453 745.04 <2e-16 ***
## log(price) -3.13869 0.02293 -136.89 <2e-16 ***
## brandminute.maid 0.87017 0.01293 67.32 <2e-16 ***
## brandtropicana 1.52994 0.01631 93.81 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.7935 on 28943 degrees of freedom
## Multiple R-squared: 0.3941, Adjusted R-squared: 0.394
## F-statistic: 6275 on 3 and 28943 DF, p-value: < 2.2e-16
# Diagnostic plots for the additive model; `which` spells out the four panels
# that plot() draws for an lm fit by default.
plot(model2.2, which = c(1, 2, 3, 5))




# Shared intercept with a baseline log(price) slope plus per-brand slope
# offsets. The printed residuals and R-squared match model2.1, so this is the
# same fit under a different parameterisation.
model2.3 <- lm(formula = logmove ~ log(price) + log(price):brand, data = oj)
summary(model2.3)
##
## Call:
## lm(formula = logmove ~ log(price) + log(price):brand, data = oj)
##
## Residuals:
## Min 1Q Median 3Q Max
## -5.7859 -0.5188 -0.0570 0.4840 3.5856
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 11.29172 0.01668 676.76 <2e-16 ***
## log(price) -3.92032 0.03042 -128.88 <2e-16 ***
## log(price):brandminute.maid 1.26188 0.01873 67.38 <2e-16 ***
## log(price):brandtropicana 1.79031 0.01979 90.45 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.801 on 28943 degrees of freedom
## Multiple R-squared: 0.3827, Adjusted R-squared: 0.3826
## F-statistic: 5981 on 3 and 28943 DF, p-value: < 2.2e-16
# Diagnostic plots for the slope-offset model; `which` spells out the four
# panels that plot() draws for an lm fit by default.
plot(model2.3, which = c(1, 2, 3, 5))



